In [1]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
pd.set_option('display.max_columns',None)
In [2]:
# Reading csv file
df = pd.read_csv('Chennai houseing sale.csv')
df.head(5)
Out[2]:
PRT_ID AREA INT_SQFT DATE_SALE DIST_MAINROAD N_BEDROOM N_BATHROOM N_ROOM SALE_COND PARK_FACIL DATE_BUILD BUILDTYPE UTILITY_AVAIL STREET MZZONE QS_ROOMS QS_BATHROOM QS_BEDROOM QS_OVERALL REG_FEE COMMIS SALES_PRICE
0 P03210 Karapakkam 1004 04-05-2011 131 1.0 1.0 3 AbNormal Yes 15-05-1967 Commercial AllPub Paved A 4.0 3.9 4.9 4.330 380000 144400 7600000
1 P09411 Anna Nagar 1986 19-12-2006 26 2.0 1.0 5 AbNormal No 22-12-1995 Commercial AllPub Gravel RH 4.9 4.2 2.5 3.765 760122 304049 21717770
2 P01812 Adyar 909 04-02-2012 70 1.0 1.0 3 AbNormal Yes 09-02-1992 Commercial ELO Gravel RL 4.1 3.8 2.2 3.090 421094 92114 13159200
3 P05346 Velachery 1855 13-03-2010 14 3.0 2.0 5 Family No 18-03-1988 Others NoSewr Paved I 4.7 3.9 3.6 4.010 356321 77042 9630290
4 P06210 Karapakkam 1226 05-10-2009 84 1.0 1.0 3 AbNormal Yes 13-10-1979 Others AllPub Gravel C 3.0 2.5 4.1 3.290 237000 74063 7406250
In [3]:
# Information about the Data
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7109 entries, 0 to 7108
Data columns (total 22 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   PRT_ID         7109 non-null   object 
 1   AREA           7109 non-null   object 
 2   INT_SQFT       7109 non-null   int64  
 3   DATE_SALE      7109 non-null   object 
 4   DIST_MAINROAD  7109 non-null   int64  
 5   N_BEDROOM      7108 non-null   float64
 6   N_BATHROOM     7104 non-null   float64
 7   N_ROOM         7109 non-null   int64  
 8   SALE_COND      7109 non-null   object 
 9   PARK_FACIL     7109 non-null   object 
 10  DATE_BUILD     7109 non-null   object 
 11  BUILDTYPE      7109 non-null   object 
 12  UTILITY_AVAIL  7109 non-null   object 
 13  STREET         7109 non-null   object 
 14  MZZONE         7109 non-null   object 
 15  QS_ROOMS       7109 non-null   float64
 16  QS_BATHROOM    7109 non-null   float64
 17  QS_BEDROOM     7109 non-null   float64
 18  QS_OVERALL     7061 non-null   float64
 19  REG_FEE        7109 non-null   int64  
 20  COMMIS         7109 non-null   int64  
 21  SALES_PRICE    7109 non-null   int64  
dtypes: float64(6), int64(6), object(10)
memory usage: 1.2+ MB
In [4]:
# Changing Date Column to Datetime Datatype
df['DATE_SALE'] = pd.to_datetime(df['DATE_SALE'], format= '%d-%m-%Y')
df['DATE_BUILD'] = pd.to_datetime(df['DATE_BUILD'], format= '%d-%m-%Y')
df['PRT_ID'] = df['PRT_ID'].str[1:].astype('int64')
In [5]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7109 entries, 0 to 7108
Data columns (total 22 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   PRT_ID         7109 non-null   int64         
 1   AREA           7109 non-null   object        
 2   INT_SQFT       7109 non-null   int64         
 3   DATE_SALE      7109 non-null   datetime64[ns]
 4   DIST_MAINROAD  7109 non-null   int64         
 5   N_BEDROOM      7108 non-null   float64       
 6   N_BATHROOM     7104 non-null   float64       
 7   N_ROOM         7109 non-null   int64         
 8   SALE_COND      7109 non-null   object        
 9   PARK_FACIL     7109 non-null   object        
 10  DATE_BUILD     7109 non-null   datetime64[ns]
 11  BUILDTYPE      7109 non-null   object        
 12  UTILITY_AVAIL  7109 non-null   object        
 13  STREET         7109 non-null   object        
 14  MZZONE         7109 non-null   object        
 15  QS_ROOMS       7109 non-null   float64       
 16  QS_BATHROOM    7109 non-null   float64       
 17  QS_BEDROOM     7109 non-null   float64       
 18  QS_OVERALL     7061 non-null   float64       
 19  REG_FEE        7109 non-null   int64         
 20  COMMIS         7109 non-null   int64         
 21  SALES_PRICE    7109 non-null   int64         
dtypes: datetime64[ns](2), float64(6), int64(7), object(7)
memory usage: 1.2+ MB
In [6]:
df.head(2)
Out[6]:
PRT_ID AREA INT_SQFT DATE_SALE DIST_MAINROAD N_BEDROOM N_BATHROOM N_ROOM SALE_COND PARK_FACIL DATE_BUILD BUILDTYPE UTILITY_AVAIL STREET MZZONE QS_ROOMS QS_BATHROOM QS_BEDROOM QS_OVERALL REG_FEE COMMIS SALES_PRICE
0 3210 Karapakkam 1004 2011-05-04 131 1.0 1.0 3 AbNormal Yes 1967-05-15 Commercial AllPub Paved A 4.0 3.9 4.9 4.330 380000 144400 7600000
1 9411 Anna Nagar 1986 2006-12-19 26 2.0 1.0 5 AbNormal No 1995-12-22 Commercial AllPub Gravel RH 4.9 4.2 2.5 3.765 760122 304049 21717770
In [7]:
# Size of the data
df.shape
Out[7]:
(7109, 22)
In [8]:
# Checking for Null Values
df.isnull().sum()
Out[8]:
PRT_ID            0
AREA              0
INT_SQFT          0
DATE_SALE         0
DIST_MAINROAD     0
N_BEDROOM         1
N_BATHROOM        5
N_ROOM            0
SALE_COND         0
PARK_FACIL        0
DATE_BUILD        0
BUILDTYPE         0
UTILITY_AVAIL     0
STREET            0
MZZONE            0
QS_ROOMS          0
QS_BATHROOM       0
QS_BEDROOM        0
QS_OVERALL       48
REG_FEE           0
COMMIS            0
SALES_PRICE       0
dtype: int64
In [9]:
# Replacing the Null Values in 'QS_OVERALL' column by the Average of 'QS_ROOMS', 'QS_BATHROOM', 'QS_BEDROOM'.
df['QS_OVERALL'] = df['QS_OVERALL'].fillna(round(df[['QS_ROOMS', 'QS_BATHROOM', 'QS_BEDROOM']].mean(axis=1), 2))
In [10]:
# Droping Null Values
df.dropna(inplace=True)
In [11]:
# Checking if Null Values are gone
df.isnull().sum()
Out[11]:
PRT_ID           0
AREA             0
INT_SQFT         0
DATE_SALE        0
DIST_MAINROAD    0
N_BEDROOM        0
N_BATHROOM       0
N_ROOM           0
SALE_COND        0
PARK_FACIL       0
DATE_BUILD       0
BUILDTYPE        0
UTILITY_AVAIL    0
STREET           0
MZZONE           0
QS_ROOMS         0
QS_BATHROOM      0
QS_BEDROOM       0
QS_OVERALL       0
REG_FEE          0
COMMIS           0
SALES_PRICE      0
dtype: int64
In [12]:
df.shape
Out[12]:
(7103, 22)
In [13]:
# Changing datatype to int
df['N_BEDROOM'] = df['N_BEDROOM'].fillna(0).astype('int64')
df['N_BATHROOM'] = df['N_BATHROOM'].fillna(0).astype('int64')
In [14]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 7103 entries, 0 to 7108
Data columns (total 22 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   PRT_ID         7103 non-null   int64         
 1   AREA           7103 non-null   object        
 2   INT_SQFT       7103 non-null   int64         
 3   DATE_SALE      7103 non-null   datetime64[ns]
 4   DIST_MAINROAD  7103 non-null   int64         
 5   N_BEDROOM      7103 non-null   int64         
 6   N_BATHROOM     7103 non-null   int64         
 7   N_ROOM         7103 non-null   int64         
 8   SALE_COND      7103 non-null   object        
 9   PARK_FACIL     7103 non-null   object        
 10  DATE_BUILD     7103 non-null   datetime64[ns]
 11  BUILDTYPE      7103 non-null   object        
 12  UTILITY_AVAIL  7103 non-null   object        
 13  STREET         7103 non-null   object        
 14  MZZONE         7103 non-null   object        
 15  QS_ROOMS       7103 non-null   float64       
 16  QS_BATHROOM    7103 non-null   float64       
 17  QS_BEDROOM     7103 non-null   float64       
 18  QS_OVERALL     7103 non-null   float64       
 19  REG_FEE        7103 non-null   int64         
 20  COMMIS         7103 non-null   int64         
 21  SALES_PRICE    7103 non-null   int64         
dtypes: datetime64[ns](2), float64(4), int64(9), object(7)
memory usage: 1.2+ MB
In [15]:
# lets see the unique values of columns
for cols in df.columns:
    if df[cols].dtype == object:
        print()
        print(cols)
        print(df[cols].unique())
AREA
['Karapakkam' 'Anna Nagar' 'Adyar' 'Velachery' 'Chrompet' 'KK Nagar'
 'TNagar' 'T Nagar' 'Chrompt' 'Chrmpet' 'Karapakam' 'Ana Nagar' 'Chormpet'
 'Adyr' 'Velchery' 'Ann Nagar' 'KKNagar']

SALE_COND
['AbNormal' 'Family' 'Partial' 'AdjLand' 'Normal Sale' 'Ab Normal'
 'Partiall' 'Adj Land' 'PartiaLl']

PARK_FACIL
['Yes' 'No' 'Noo']

BUILDTYPE
['Commercial' 'Others' 'Other' 'House' 'Comercial']

UTILITY_AVAIL
['AllPub' 'ELO' 'NoSewr ' 'NoSeWa' 'All Pub']

STREET
['Paved' 'Gravel' 'No Access' 'Pavd' 'NoAccess']

MZZONE
['A' 'RH' 'RL' 'I' 'C' 'RM']

There seems to be some misspells in the Dataset¶

In [16]:
# Cleaning the Data 
df.AREA.replace(['Ana Nagar','Ann Nagar'],'Anna Nagar',inplace=True)
df.AREA.replace('Karapakam','Karapakkam',inplace=True)
df.AREA.replace(['Chrompt','Chrmpet','Chormpet','Chrompet'],'Chromepet',inplace=True)
df.AREA.replace('KKNagar','KK Nagar',inplace=True)
df.AREA.replace('TNagar','T Nagar',inplace=True)
df.AREA.replace('Adyr','Adyar',inplace=True)
df.AREA.replace('Velchery','Velachery',inplace=True)
df.BUILDTYPE.replace('Comercial','Commercial',inplace=True)
df.BUILDTYPE.replace('Other','Others',inplace=True)
df.UTILITY_AVAIL.replace('All Pub','AllPub',inplace=True)
df.SALE_COND.replace('Ab Normal','AbNormal',inplace=True)
df.SALE_COND.replace(['PartiaLl','Partiall'],'Partial',inplace=True)
df.SALE_COND.replace('Adj Land','AdjLand',inplace=True)
df.PARK_FACIL.replace('Noo','No',inplace=True)
df.STREET.replace('Pavd','Paved',inplace=True)
df.STREET.replace('NoAccess','No Access',inplace=True)
In [17]:
# Checking if the changes has been applied
for cols in df.columns:
    if df[cols].dtype == object:
        print()
        print(cols)
        print(df[cols].unique())
AREA
['Karapakkam' 'Anna Nagar' 'Adyar' 'Velachery' 'Chromepet' 'KK Nagar'
 'T Nagar']

SALE_COND
['AbNormal' 'Family' 'Partial' 'AdjLand' 'Normal Sale']

PARK_FACIL
['Yes' 'No']

BUILDTYPE
['Commercial' 'Others' 'House']

UTILITY_AVAIL
['AllPub' 'ELO' 'NoSewr ' 'NoSeWa']

STREET
['Paved' 'Gravel' 'No Access']

MZZONE
['A' 'RH' 'RL' 'I' 'C' 'RM']
In [18]:
df.head(3)
Out[18]:
PRT_ID AREA INT_SQFT DATE_SALE DIST_MAINROAD N_BEDROOM N_BATHROOM N_ROOM SALE_COND PARK_FACIL DATE_BUILD BUILDTYPE UTILITY_AVAIL STREET MZZONE QS_ROOMS QS_BATHROOM QS_BEDROOM QS_OVERALL REG_FEE COMMIS SALES_PRICE
0 3210 Karapakkam 1004 2011-05-04 131 1 1 3 AbNormal Yes 1967-05-15 Commercial AllPub Paved A 4.0 3.9 4.9 4.330 380000 144400 7600000
1 9411 Anna Nagar 1986 2006-12-19 26 2 1 5 AbNormal No 1995-12-22 Commercial AllPub Gravel RH 4.9 4.2 2.5 3.765 760122 304049 21717770
2 1812 Adyar 909 2012-02-04 70 1 1 3 AbNormal Yes 1992-02-09 Commercial ELO Gravel RL 4.1 3.8 2.2 3.090 421094 92114 13159200

The Dataset is Cleaned and Ready for Analysis¶

In [19]:
# Stastical Description of Data
df.describe()
Out[19]:
PRT_ID INT_SQFT DATE_SALE DIST_MAINROAD N_BEDROOM N_BATHROOM N_ROOM DATE_BUILD QS_ROOMS QS_BATHROOM QS_BEDROOM QS_OVERALL REG_FEE COMMIS SALES_PRICE
count 7103.000000 7103.000000 7103 7103.000000 7103.000000 7103.00000 7103.000000 7103 7103.000000 7103.000000 7103.000000 7103.000000 7103.000000 7103.000000 7.103000e+03
mean 5034.853865 1382.117556 2010-04-06 11:12:03.243699968 99.578629 1.637336 1.21329 3.688723 1985-12-28 20:14:21.607771392 3.517753 3.507377 3.485513 3.504131 376952.707588 141055.364212 1.089477e+07
min 1.000000 500.000000 2004-01-16 00:00:00 0.000000 1.000000 1.00000 2.000000 1949-10-28 00:00:00 2.000000 2.000000 2.000000 2.000000 71177.000000 5055.000000 2.156875e+06
25% 2536.500000 993.000000 2008-10-17 00:00:00 50.000000 1.000000 1.00000 3.000000 1976-08-20 12:00:00 2.700000 2.700000 2.700000 3.130000 272469.000000 84252.500000 8.270900e+06
50% 5085.000000 1373.000000 2010-05-03 00:00:00 99.000000 1.000000 1.00000 4.000000 1986-10-13 00:00:00 3.500000 3.500000 3.500000 3.500000 349509.000000 127644.000000 1.033464e+07
75% 7513.500000 1744.000000 2011-09-15 00:00:00 148.000000 2.000000 1.00000 4.000000 1996-07-01 00:00:00 4.300000 4.300000 4.300000 3.890000 451544.500000 184534.000000 1.299422e+07
max 10034.000000 2500.000000 2015-02-17 00:00:00 200.000000 4.000000 2.00000 6.000000 2010-11-12 00:00:00 5.000000 5.000000 5.000000 4.970000 983922.000000 495405.000000 2.366734e+07
std 2885.722305 457.484517 NaN 57.399230 0.803030 0.40966 1.019299 NaN 0.891931 0.898042 0.887067 0.527168 143088.109006 78775.875689 3.769549e+06
In [20]:
df.AREA.value_counts()
Out[20]:
AREA
Chromepet     1699
Karapakkam    1366
KK Nagar       997
Velachery      981
Anna Nagar     785
Adyar          774
T Nagar        501
Name: count, dtype: int64
In [21]:
# Distribution of Houses in various Areas
px.pie(df.groupby('AREA',as_index=False)['PRT_ID'].count(), values='PRT_ID', names='AREA',
       title='<b> Number of houses in various areas of Chennai</b>', labels={'PRT_ID':'Count'},
       color_discrete_sequence=px.colors.sequential.Plasma, hole=.5)

Insights :¶

  • Chrompet has the highest number of houses among all areas in Chennai.
  • T Nagar has a lower number of houses compared to other areas in Chennai.
In [22]:
px.box(df, x='AREA', y='SALES_PRICE', title= '<b>Sales Price of houses in various areas of Chennai',
    color= 'AREA')

Insights :¶

  • T Nagar and Anna Nagar areas have the highest selling prices for houses, while KK Nagar comes in Third.
  • Houses in Karapakam area have lower selling prices compared to houses in other areas.
In [23]:
    px.scatter(df,x= 'INT_SQFT', y= 'SALES_PRICE', color= 'AREA', size= 'INT_SQFT',
               title= '<b> SQFT Versus SALES PRICE of Houses in Various Areas', labels= {'INT_SQFT': 'SQFT'})

Insights :¶

  • An increase in the square footage of houses is associated with an increase in the sales price of houses.
  • Houses in the same area tend to have similar square footage.
  • Houses in T Nagar and Anna Nagar areas have the highest prices, with square footage typically ranging from 1500 to 2000.
  • KK Nagar area houses have Higher square foot, typically ranging from 1400 to 2500.
In [24]:
px.scatter(df, x='DIST_MAINROAD', y='SALES_PRICE', color='AREA', size='SALES_PRICE',
           title='<b> MAINROAD DISTANCE Versus SALES PRICE of Houses in Various Areas',
           labels= {'DIST_MAINROAD':'Distance from Mainroad'})

Insights :¶

  • It appears that the distance to the main road does not significantly affect the sales price of houses.
  • Houses with both shorter and longer distances to the main road have similar prices, indicating that the main road distance does not have a strong impact on the sales price.
In [25]:
fig=px.scatter(df, x='DATE_BUILD', y='SALES_PRICE', color='AREA', size='SALES_PRICE',
               title='<b> Build Date Vs Sales Price')
fig.show()
fig=px.scatter(df, x='DATE_SALE', y='SALES_PRICE', color='AREA', size='SALES_PRICE',
               title='<b> Sale Date Vs Sales Price')
fig.show()

Insights :¶

  • The columns 'DATE_BUILD' and 'DATE_SALE' do not appear to have a significant impact on the 'SALES_PRICE' of properties.
  • This suggests that the dates when the properties were built and sold do not directly influence their selling prices.
In [26]:
px.box(df, x='N_ROOM', y='SALES_PRICE', color='AREA', labels= {'N_ROOM':'No.of Rooms'},
       title='<b> Total Rooms Versus Sales Price of Houses in Various Areas')

Insights:¶

  • There is a positive correlation between the number of rooms ('N_ROOM') and the sales price ('SALES_PRICE'). As the number of rooms increases, the sales price also tends to increase.

  • The majority of houses in the dataset have 4 to 5 rooms, indicating that this range is common among the properties.

  • Only KK Nagar has houses with 6 rooms. This aligns with the previous insight showing that KK Nagar has the highest square footage houses, suggesting that larger houses with more rooms are more common in KK Nagar.

  • Anna Nagar and T Nagar have houses predominantly with 4 to 5 rooms. Despite this, they have higher sales prices, indicating that these areas are desirable and command higher prices despite having fewer properties with more rooms.

In [27]:
px.box(df, x='N_BEDROOM', y='SALES_PRICE', color='AREA', labels= {'N_BEDROOM':'No.of Bedrooms'},
       title='<b>Total BedRooms Versus Sales Price of Houses in Various Areas')

Insights:¶

  • There is a positive correlation between the number of bedrooms ('N_BEDROOM') and the sales price ('SALES_PRICE'). As the number of bedrooms increases, the sales price also tends to increase.

  • While an increase in the number of bedrooms tends to increase the sales price, the sales are directly affected by the area where the property is located.

  • The majority of houses in the dataset have 1 to 2 bedrooms, indicating that this range is common among the properties.

  • KK Nagar and Velachery are the only areas with 3-bedroom houses, but KK Nagar also has 4-bedroom houses, indicating a wider range of housing options in KK Nagar.

  • Anna Nagar and T Nagar have houses predominantly with 1 to 2 bedrooms. Despite this, they have higher sales prices, suggesting that the area itself has a direct effect on the sales price, regardless of the number of bedrooms in the properties.

In [28]:
px.box(df,x='N_BATHROOM', y='SALES_PRICE', color='AREA', labels= {'N_BATHROOM':'No.of Bathrooms'},
       title='<b>Total BathRooms Versus Sales Price of Houses in Various Areas')

Insights:¶

  • There is a positive correlation between the number of bathrooms ('N_BATHROOM') and the sales price ('SALES_PRICE'). As the number of bathrooms increases, the sales price also tends to increase.

  • More than half of the houses in the dataset have 1 bathroom, indicating that this is the most common number of bathrooms among the properties.

  • Anna Nagar and T Nagar have houses with only a single bathroom. Despite this, they have higher sales prices, suggesting that these areas are desirable and command higher prices despite having fewer bathrooms in the properties.

In [29]:
px.box(df, x='SALE_COND', y='SALES_PRICE', color='AREA', labels= {'SALE_COND':'Sale Condition'},
       title='<b>Sale Condition Versus Sales Price of Houses in various Areas')

Insights:¶

  • There doesn't seem to be much difference in sales price based on sale condition.
  • The sale condition doesn't appear to have a significant impact on the sales price of properties.
In [30]:
px.box(df, x='PARK_FACIL', y='SALES_PRICE', color='AREA', labels= {'PARK_FACIL':'Parking Facility'},
       title='<b>Parking Facility Versus Sales Price of Houses in Different Areas')

Insight:¶

  • Houses with a parking facility tend to have slightly higher prices across different areas.
  • This suggests that having a parking facility is a desirable feature that can positively impact the sales price of a house.
In [31]:
px.box(df, x='BUILDTYPE', y='SALES_PRICE', color='AREA',
       title='<b>Build Type Versus Sales Price of Houses in Different Areas')

Insights:¶

  • Commercial houses appear to have significantly higher prices compared to residential houses and other property types in different areas.
In [32]:
px.box(df, x='UTILITY_AVAIL', y='SALES_PRICE', color='AREA',
       title='<b> Utility Available Versus Sales Price of Houses in Different Areas')

Insight:¶

  • Across different utility availability types there isn't a significant difference in sales prices.
  • This suggests that in this dataset utility availability may not be a major factor affecting property sales prices.
In [33]:
px.box(df, x='STREET', y='SALES_PRICE', color='AREA',
       title='<b> Street Versus Sales Price of Houses in Different Areas')

Insight:¶

  • There isn't much difference in sales price between properties located on a Paved street and those on a Gravel street.
  • Properties with no street access have lower sales prices compared to properties on both Paved and Gravel streets.
In [34]:
px.box(df, x='MZZONE', y='SALES_PRICE', color='AREA',
       title='<b> MZZONE Versus Sales Price of Houses in Different Areas')

Insight:¶

  • Houses in RM (Residential Medium Density) zones tend to have the highest sales prices among all zones, followed by RL (Residential Low Density) and RH (Residential High Density) zones.

  • Houses in A (Agricultural) zones tend to have lower sales prices compared to all, typically indicates areas designated for agricultural use.

  • The areas 'Anna Nagar', 'Chromepet', 'KK Nagar', and 'T Nagar' only have houses in residential zones, indicating that these areas are primarily residential in nature.

In [35]:
numeric_columns = ['INT_SQFT', 'DIST_MAINROAD', 'N_BEDROOM', 'N_BATHROOM', 'N_ROOM', 'QS_ROOMS', 'QS_BATHROOM', 'QS_BEDROOM', 'QS_OVERALL', 'REG_FEE', 'COMMIS', 'SALES_PRICE']
numeric_df = df[numeric_columns]

corr_matrix = numeric_df.corr()
numeric_df.corr()
Out[35]:
INT_SQFT DIST_MAINROAD N_BEDROOM N_BATHROOM N_ROOM QS_ROOMS QS_BATHROOM QS_BEDROOM QS_OVERALL REG_FEE COMMIS SALES_PRICE
INT_SQFT 1.000000 0.001973 0.786392 0.515433 0.951271 0.019720 -0.008168 0.008385 0.012163 0.657435 0.571125 0.612026
DIST_MAINROAD 0.001973 1.000000 -0.002644 0.002074 0.002136 0.001762 -0.029529 0.001890 -0.016771 0.011608 0.011218 0.018679
N_BEDROOM 0.786392 -0.002644 1.000000 0.755214 0.840357 0.014750 -0.007692 0.015335 0.012541 0.455301 0.430224 0.330933
N_BATHROOM 0.515433 0.002074 0.755214 1.000000 0.568725 0.013219 -0.012124 0.013076 0.007565 0.260299 0.256510 0.108884
N_ROOM 0.951271 0.002136 0.840357 0.568725 1.000000 0.016410 -0.007428 0.014618 0.014053 0.630801 0.533423 0.602637
QS_ROOMS 0.019720 0.001762 0.014750 0.013219 0.016410 1.000000 0.008809 0.008284 0.517387 0.019772 0.008976 0.021960
QS_BATHROOM -0.008168 -0.029529 -0.007692 -0.012124 -0.007428 0.008809 1.000000 -0.011646 0.551877 -0.006705 -0.000585 -0.011361
QS_BEDROOM 0.008385 0.001890 0.015335 0.013076 0.014618 0.008284 -0.011646 1.000000 0.630074 0.020903 0.019928 0.018430
QS_OVERALL 0.012163 -0.016771 0.012541 0.007565 0.014053 0.517387 0.551877 0.630074 1.000000 0.021101 0.016716 0.019786
REG_FEE 0.657435 0.011608 0.455301 0.260299 0.630801 0.019772 -0.006705 0.020903 0.021101 1.000000 0.660004 0.878126
COMMIS 0.571125 0.011218 0.430224 0.256510 0.533423 0.008976 -0.000585 0.019928 0.016716 0.660004 1.000000 0.626413
SALES_PRICE 0.612026 0.018679 0.330933 0.108884 0.602637 0.021960 -0.011361 0.018430 0.019786 0.878126 0.626413 1.000000
In [36]:
plt.figure(figsize=(12,6))
sns.heatmap(numeric_df.corr(),annot=True,cmap='GnBu')
plt.title('Correlation of Numeric Column')
plt.show()
Image
In [37]:
df.head(5)
Out[37]:
PRT_ID AREA INT_SQFT DATE_SALE DIST_MAINROAD N_BEDROOM N_BATHROOM N_ROOM SALE_COND PARK_FACIL DATE_BUILD BUILDTYPE UTILITY_AVAIL STREET MZZONE QS_ROOMS QS_BATHROOM QS_BEDROOM QS_OVERALL REG_FEE COMMIS SALES_PRICE
0 3210 Karapakkam 1004 2011-05-04 131 1 1 3 AbNormal Yes 1967-05-15 Commercial AllPub Paved A 4.0 3.9 4.9 4.330 380000 144400 7600000
1 9411 Anna Nagar 1986 2006-12-19 26 2 1 5 AbNormal No 1995-12-22 Commercial AllPub Gravel RH 4.9 4.2 2.5 3.765 760122 304049 21717770
2 1812 Adyar 909 2012-02-04 70 1 1 3 AbNormal Yes 1992-02-09 Commercial ELO Gravel RL 4.1 3.8 2.2 3.090 421094 92114 13159200
3 5346 Velachery 1855 2010-03-13 14 3 2 5 Family No 1988-03-18 Others NoSewr Paved I 4.7 3.9 3.6 4.010 356321 77042 9630290
4 6210 Karapakkam 1226 2009-10-05 84 1 1 3 AbNormal Yes 1979-10-13 Others AllPub Gravel C 3.0 2.5 4.1 3.290 237000 74063 7406250
In [38]:
# Droping Unnecessary Columns based on our insights
df.drop(['DATE_SALE', 'DATE_BUILD', 'DIST_MAINROAD', 'SALE_COND','UTILITY_AVAIL', 'STREET',
         'QS_ROOMS', 'QS_BATHROOM', 'QS_BEDROOM', 'QS_OVERALL', 'REG_FEE', 'COMMIS'], axis=1, inplace=True)
df
Out[38]:
PRT_ID AREA INT_SQFT N_BEDROOM N_BATHROOM N_ROOM PARK_FACIL BUILDTYPE MZZONE SALES_PRICE
0 3210 Karapakkam 1004 1 1 3 Yes Commercial A 7600000
1 9411 Anna Nagar 1986 2 1 5 No Commercial RH 21717770
2 1812 Adyar 909 1 1 3 Yes Commercial RL 13159200
3 5346 Velachery 1855 3 2 5 No Others I 9630290
4 6210 Karapakkam 1226 1 1 3 Yes Others C 7406250
... ... ... ... ... ... ... ... ... ... ...
7104 3834 Karapakkam 598 1 1 2 No Others RM 5353000
7105 10000 Velachery 1897 3 2 5 Yes Others RH 10818480
7106 9594 Velachery 1614 2 1 4 No House I 8351410
7107 6508 Karapakkam 787 1 1 2 Yes Commercial RL 8507000
7108 9794 Velachery 1896 3 2 5 Yes Others I 9976480

7103 rows × 10 columns

In [39]:
# Creating a backup file
df_bk=df.copy()
In [40]:
df_bk
Out[40]:
PRT_ID AREA INT_SQFT N_BEDROOM N_BATHROOM N_ROOM PARK_FACIL BUILDTYPE MZZONE SALES_PRICE
0 3210 Karapakkam 1004 1 1 3 Yes Commercial A 7600000
1 9411 Anna Nagar 1986 2 1 5 No Commercial RH 21717770
2 1812 Adyar 909 1 1 3 Yes Commercial RL 13159200
3 5346 Velachery 1855 3 2 5 No Others I 9630290
4 6210 Karapakkam 1226 1 1 3 Yes Others C 7406250
... ... ... ... ... ... ... ... ... ... ...
7104 3834 Karapakkam 598 1 1 2 No Others RM 5353000
7105 10000 Velachery 1897 3 2 5 Yes Others RH 10818480
7106 9594 Velachery 1614 2 1 4 No House I 8351410
7107 6508 Karapakkam 787 1 1 2 Yes Commercial RL 8507000
7108 9794 Velachery 1896 3 2 5 Yes Others I 9976480

7103 rows × 10 columns

Coverting the labels into a numeric form using Label Encoder¶

In [41]:
from sklearn.preprocessing import LabelEncoder
encoders = {}
original_categories = {}

# Iterate over each column
for col in df.select_dtypes(include='object').columns:
    # Create a LabelEncoder object
    encoders[col] = LabelEncoder()
    # Fit and transform the data for each column
    df[col] = encoders[col].fit_transform(df[col])
    # Store the original categories
    original_categories[col] = encoders[col].classes_

    # Print original categories and their corresponding encoded values
    print(f"Column: {col}")
    for category, encoded_value in zip(original_categories[col], encoders[col].transform(original_categories[col])):
        print(f"  Original Category: {category}  Encoded Value: {encoded_value}")
Column: AREA
  Original Category: Adyar  Encoded Value: 0
  Original Category: Anna Nagar  Encoded Value: 1
  Original Category: Chromepet  Encoded Value: 2
  Original Category: KK Nagar  Encoded Value: 3
  Original Category: Karapakkam  Encoded Value: 4
  Original Category: T Nagar  Encoded Value: 5
  Original Category: Velachery  Encoded Value: 6
Column: PARK_FACIL
  Original Category: No  Encoded Value: 0
  Original Category: Yes  Encoded Value: 1
Column: BUILDTYPE
  Original Category: Commercial  Encoded Value: 0
  Original Category: House  Encoded Value: 1
  Original Category: Others  Encoded Value: 2
Column: MZZONE
  Original Category: A  Encoded Value: 0
  Original Category: C  Encoded Value: 1
  Original Category: I  Encoded Value: 2
  Original Category: RH  Encoded Value: 3
  Original Category: RL  Encoded Value: 4
  Original Category: RM  Encoded Value: 5
In [42]:
df.head()
Out[42]:
PRT_ID AREA INT_SQFT N_BEDROOM N_BATHROOM N_ROOM PARK_FACIL BUILDTYPE MZZONE SALES_PRICE
0 3210 4 1004 1 1 3 1 0 0 7600000
1 9411 1 1986 2 1 5 0 0 3 21717770
2 1812 0 909 1 1 3 1 0 4 13159200
3 5346 6 1855 3 2 5 0 2 2 9630290
4 6210 4 1226 1 1 3 1 2 1 7406250
In [43]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 7103 entries, 0 to 7108
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   PRT_ID       7103 non-null   int64
 1   AREA         7103 non-null   int32
 2   INT_SQFT     7103 non-null   int64
 3   N_BEDROOM    7103 non-null   int64
 4   N_BATHROOM   7103 non-null   int64
 5   N_ROOM       7103 non-null   int64
 6   PARK_FACIL   7103 non-null   int32
 7   BUILDTYPE    7103 non-null   int32
 8   MZZONE       7103 non-null   int32
 9   SALES_PRICE  7103 non-null   int64
dtypes: int32(4), int64(6)
memory usage: 499.4 KB
In [44]:
# storing the Dependent Variables in X and Independent Variable in Y
x=df.drop('SALES_PRICE',axis=1)
y=df['SALES_PRICE']
In [45]:
# Splitting the Data into Training set and Testing Set
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.30,random_state=42)
x_train.shape,x_test.shape,y_train.shape,y_test.shape
Out[45]:
((4972, 9), (2131, 9), (4972,), (2131,))
In [46]:
# Scaling the values to convert the int values to Machine Languages
from sklearn.preprocessing import MinMaxScaler
mmscaler=MinMaxScaler(feature_range=(0,1))
x_train=mmscaler.fit_transform(x_train)
x_test=mmscaler.fit_transform(x_test)
x_train=pd.DataFrame(x_train)
x_test=pd.DataFrame(x_test)
In [47]:
# Creating a Dataframe to store the metrics score
a={'Model Name':[], 'Mean_Absolute_Error MAE':[] ,'Mean_Absolute_Percentage_Error MAPE':[] ,'Mean_Squared_Error MSE':[],'Root_Mean_Squared_Error RMSE':[] ,'Root_Mean_Squared_Log_Error RMSLE':[] ,'R2 score':[],'Adj_R_Square':[]}
Results=pd.DataFrame(a)
Results.head()
Out[47]:
Model Name Mean_Absolute_Error MAE Mean_Absolute_Percentage_Error MAPE Mean_Squared_Error MSE Root_Mean_Squared_Error RMSE Root_Mean_Squared_Log_Error RMSLE R2 score Adj_R_Square
In [48]:
# Build the Regression / Regressor models

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
import xgboost as xgb
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
In [49]:
# Create objects of Regression / Regressor models with default hyper-parameters

modelmlg = LinearRegression()
modeldcr = DecisionTreeRegressor()
modelrfr = RandomForestRegressor()
modelSVR = SVR()
modelXGR = xgb.XGBRegressor()
modelKNN = KNeighborsRegressor(n_neighbors=5)
modelETR = ExtraTreesRegressor()
modelGBR = GradientBoostingRegressor()
In [50]:
MM = [modelmlg, modeldcr, modelrfr, modelKNN, modelETR, modelGBR, modelXGR]

for models in MM:
    
    # Fit the model with train data
    models.fit(x_train, y_train)
    
    # Predict the model with test data
    y_pred = models.predict(x_test)
    
    # Print the model name
    print('Model Name: ', models)
    
    # Evaluation metrics for Regression analysis
    from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_squared_log_error
    from sklearn import metrics

    # Assuming y_true are the actual values and y_pred are the predicted values
    mae = mean_absolute_error(y_test, y_pred)
    mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    rmsle = np.log(rmse)
    r_squared = r2_score(y_test, y_pred)

    # Adjusted R-squared
    n = len(y_test)
    p = x.shape[1]  # Number of features
    adj_r_squared = 1 - (1 - r_squared) * ((n - 1) / (n - p - 1))

    print(f"Mean Absolute Error (MAE): {mae}")
    print(f"Mean Absolute Percentage Error (MAPE): {mape}")
    print(f"Mean Squared Error (MSE): {mse}")
    print(f"Root Mean Squared Error (RMSE): {rmse}")
    print(f"Root Mean Squared Log Error (RMSLE): {rmsle}")
    print(f"R-squared (R^2): {r_squared}")
    print(f"Adjusted R-squared: {adj_r_squared}")
    print('------------------------------------------------------------------------------------------------------------')
    
#-------------------------------------------------------------------------------------------
    new_row = pd.DataFrame({'Model Name': [str(models)],
                            'Mean_Absolute_Error MAE': [metrics.mean_absolute_error(y_test, y_pred)],
                            'Mean_Absolute_Percentage_Error MAPE': [np.mean(np.abs((y_test - y_pred) / y_test)) * 100],
                            'Mean_Squared_Error MSE': [metrics.mean_squared_error(y_test, y_pred)],
                            'Root_Mean_Squared_Error RMSE': [np.sqrt(metrics.mean_squared_error(y_test, y_pred))],
                            'Root_Mean_Squared_Log_Error RMSLE': [np.log(np.sqrt(metrics.mean_squared_error(y_test, y_pred)))],
                            'R2 score': [metrics.r2_score(y_test, y_pred)],
                            'Adj_R_Square': [adj_r_squared]})

    # Append new_row to Results
    Results = pd.concat([Results, new_row], ignore_index=True)
Model Name:  LinearRegression()
Mean Absolute Error (MAE): 1425145.8457698394
Mean Absolute Percentage Error (MAPE): 14.474119138246595
Mean Squared Error (MSE): 3036548275910.5264
Root Mean Squared Error (RMSE): 1742569.4465101028
Root Mean Squared Log Error (RMSLE): 14.370871275347811
R-squared (R^2): 0.7865146189458103
Adjusted R-squared: 0.7856087403840528
------------------------------------------------------------------------------------------------------------
Model Name:  DecisionTreeRegressor()
Mean Absolute Error (MAE): 696385.4856874706
Mean Absolute Percentage Error (MAPE): 7.363178860245062
Mean Squared Error (MSE): 794892032546.6682
Root Mean Squared Error (RMSE): 891567.1778092036
Root Mean Squared Log Error (RMSLE): 13.700736067130293
R-squared (R^2): 0.9441148919609125
Adjusted R-squared: 0.9438777557174651
------------------------------------------------------------------------------------------------------------
Model Name:  RandomForestRegressor()
Mean Absolute Error (MAE): 530397.0632801501
Mean Absolute Percentage Error (MAPE): 5.670921520716918
Mean Squared Error (MSE): 441589767563.65186
Root Mean Squared Error (RMSE): 664522.2099852283
Root Mean Squared Log Error (RMSLE): 13.406823580181438
R-squared (R^2): 0.9689539071234792
Adjusted R-squared: 0.9688221698128292
------------------------------------------------------------------------------------------------------------
Model Name:  KNeighborsRegressor()
Mean Absolute Error (MAE): 598438.4739558892
Mean Absolute Percentage Error (MAPE): 6.2610300515341635
Mean Squared Error (MSE): 561350972685.7156
Root Mean Squared Error (RMSE): 749233.5902011573
Root Mean Squared Log Error (RMSLE): 13.526806083305642
R-squared (R^2): 0.9605340618953225
Adjusted R-squared: 0.9603665968114271
------------------------------------------------------------------------------------------------------------
Model Name:  ExtraTreesRegressor()
Mean Absolute Error (MAE): 541174.539183482
Mean Absolute Percentage Error (MAPE): 5.797972934424993
Mean Squared Error (MSE): 452729270159.7874
Root Mean Squared Error (RMSE): 672851.5959405814
Root Mean Squared Log Error (RMSLE): 13.419280073078268
R-squared (R^2): 0.9681707412586859
Adjusted R-squared: 0.968035680754833
------------------------------------------------------------------------------------------------------------
Model Name:  GradientBoostingRegressor()
Mean Absolute Error (MAE): 519218.5429586458
Mean Absolute Percentage Error (MAPE): 5.435603510333493
Mean Squared Error (MSE): 407033906383.53424
Root Mean Squared Error (RMSE): 637992.0895932287
Root Mean Squared Log Error (RMSLE): 13.366081163493368
R-squared (R^2): 0.9713833666681265
Adjusted R-squared: 0.9712619382381469
------------------------------------------------------------------------------------------------------------
Model Name:  XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=None, n_jobs=None,
             num_parallel_tree=None, random_state=None, ...)
Mean Absolute Error (MAE): 522382.30091506336
Mean Absolute Percentage Error (MAPE): 5.533325394683955
Mean Squared Error (MSE): 422838835481.29785
Root Mean Squared Error (RMSE): 650260.5904414767
Root Mean Squared Log Error (RMSLE): 13.385128469901034
R-squared (R^2): 0.9702721966802367
Adjusted R-squared: 0.9701460532432362
------------------------------------------------------------------------------------------------------------
In [51]:
Results
Out[51]:
Model Name Mean_Absolute_Error MAE Mean_Absolute_Percentage_Error MAPE Mean_Squared_Error MSE Root_Mean_Squared_Error RMSE Root_Mean_Squared_Log_Error RMSLE R2 score Adj_R_Square
0 LinearRegression() 1.425146e+06 14.474119 3.036548e+12 1.742569e+06 14.370871 0.786515 0.785609
1 DecisionTreeRegressor() 6.963855e+05 7.363179 7.948920e+11 8.915672e+05 13.700736 0.944115 0.943878
2 RandomForestRegressor() 5.303971e+05 5.670922 4.415898e+11 6.645222e+05 13.406824 0.968954 0.968822
3 KNeighborsRegressor() 5.984385e+05 6.261030 5.613510e+11 7.492336e+05 13.526806 0.960534 0.960367
4 ExtraTreesRegressor() 5.411745e+05 5.797973 4.527293e+11 6.728516e+05 13.419280 0.968171 0.968036
5 GradientBoostingRegressor() 5.192185e+05 5.435604 4.070339e+11 6.379921e+05 13.366081 0.971383 0.971262
6 XGBRegressor(base_score=None, booster=None, ca... 5.223823e+05 5.533325 4.228388e+11 6.502606e+05 13.385128 0.970272 0.970146

From the above Results, the Top 3 Models by Comparing Adjacent R Square Values are¶

  • XGBRegressor
  • GradientBoostingRegressor
  • ExtraTreesRegressor

Training and Predicting with XGBRegressor

In [52]:
# Training the Model
modelXGR.fit(x_train, y_train)
    
# Predict the model with test data
y_pred = modelXGR.predict(x_test)
In [53]:
out=pd.DataFrame({'Price_actual':y_test,'Price_pred':y_pred})
result=df_bk.merge(out,left_index=True,right_index=True)
In [54]:
result
Out[54]:
PRT_ID AREA INT_SQFT N_BEDROOM N_BATHROOM N_ROOM PARK_FACIL BUILDTYPE MZZONE SALES_PRICE Price_actual Price_pred
8 3377 Chromepet 771 1 1 2 No Others RM 8308970 8308970 8097472.0
14 4085 Velachery 1865 3 2 5 No Commercial RM 15499680 15499680 16066729.0
15 6328 Velachery 1868 3 2 5 No Commercial RH 15714080 15714080 15024930.0
17 2016 Chromepet 796 1 1 2 Yes Commercial RL 10912550 10912550 11018991.0
19 1372 Anna Nagar 1902 2 1 5 Yes Commercial RL 21203240 21203240 21672568.0
... ... ... ... ... ... ... ... ... ... ... ... ...
7098 629 Anna Nagar 1611 1 1 4 Yes Others RM 13944780 13944780 14169602.0
7100 5438 T Nagar 1733 1 1 4 Yes Commercial RL 19501600 19501600 20383738.0
7102 5560 Karapakkam 701 1 1 2 No House RH 5643500 5643500 5284015.0
7103 5133 Karapakkam 1462 2 2 4 No Others RM 9387250 9387250 8950791.0
7104 3834 Karapakkam 598 1 1 2 No Others RM 5353000 5353000 6375991.5

2131 rows × 12 columns

In [55]:
result[['PRT_ID','AREA','Price_actual','Price_pred']].sample(20)
Out[55]:
PRT_ID AREA Price_actual Price_pred
6568 9292 Chromepet 10001300 10726214.0
3856 8382 Velachery 13822920 13961900.0
3317 958 Chromepet 10847700 10797060.0
6199 6250 Chromepet 9544150 9912693.0
3361 8711 Chromepet 8111250 7270345.0
6342 5109 Karapakkam 5874250 5773031.0
3528 6073 Karapakkam 6434750 6807893.0
2628 7164 Anna Nagar 19147580 18473892.0
5712 5187 Adyar 8557760 8687759.0
5905 9668 Velachery 12498160 12269813.0
293 4821 Karapakkam 7849250 7778017.5
1182 8028 Adyar 9310300 8240811.5
4957 8500 Anna Nagar 15324610 14858333.0
5650 4893 T Nagar 13081220 13594749.0
1695 7086 T Nagar 20747530 22015104.0
4048 9502 Chromepet 9847570 9511261.0
5048 3936 KK Nagar 14373860 14100759.0
4844 5229 Chromepet 9688320 10237972.0
3425 8582 Adyar 7811430 8169637.0
4488 3333 KK Nagar 11131800 11805751.0
In [56]:
px.scatter(result, x='Price_actual', y='Price_pred', trendline='ols', color_discrete_sequence=['magenta'],
           template='plotly_dark', title='<b> Actual Price  Vs  Predicted Price ')
In [ ]: